Tourism EDA - Zürich

1 Tourism EDA - Zurich

1.1 Univariate analysis

1.1.1 Load data

# Read the tourism workbook stored at data/Dataset_tourism.xlsx (path resolved via here())
tourism_data <- here("data", "Dataset_tourism.xlsx") %>%
  readxl::read_xlsx()

1.1.2 Cleaning

# Drop the aggregate rows labelled 'Herkunftsland - Total'; they only hold the total
tourism_data <- filter(tourism_data, Herkunftsland != "Herkunftsland - Total")
# Show the distinct labels found in the month column
tourism_data %>%
  pull(Monat) %>%
  unique()
#>  [1] "Januar"    "Februar"   "März"      "April"     "Mai"      
#>  [6] "Juni"      "Juli"      "August"    "September" "Oktober"  
#> [11] "November"  "Dezember"
# Translate the German month names in 'Monat' into English.
# recode_factor() also converts the column to a factor.
tourism_data$Monat <- tourism_data$Monat %>% recode_factor(
  "Januar" = "January",
  "Februar" = "February",
  "März" = "March",
  "April" = "April",
  "Mai" = "May",
  "Juni" = "June",
  "Juli" = "July",
  "August" = "August",
  "September" = "September",
  "Oktober" = "October",
  "November" = "November",
  "Dezember" = "December"
)
# Add a Date column (first day of each month) for plotting purposes.
# The date is assembled from the numeric year and the month's position in
# base R's `month.name`, so the result does not depend on the session's
# LC_TIME locale the way parsing the month name as text would.
tourism_data <- tourism_data %>%
  mutate(
    Date = make_date(
      year = as.integer(Jahr),  # 'Jahr' is stored as character
      month = match(as.character(Monat), month.name),
      day = 1L
    )
  )
# Count the missing (NA) cells across the whole table
tourism_data %>%
  is.na() %>%
  sum()
#> [1] 51395
# List the rows whose 'value' is missing to see where the gaps are
tourism_data %>%
  filter(is.na(value))
#> # A tibble: 51,395 x 6
#>    Herkunftsland                  Kanton Monat Jahr  value Date      
#>    <chr>                          <chr>  <fct> <chr> <dbl> <date>    
#>  1 Malta                          Schwe~ Janu~ 2005     NA 2005-01-01
#>  2 Zypern                         Schwe~ Janu~ 2005     NA 2005-01-01
#>  3 Mexiko                         Schwe~ Janu~ 2005     NA 2005-01-01
#>  4 Übriges Zentralamerika, Karib~ Schwe~ Janu~ 2005     NA 2005-01-01
#>  5 Bahrain                        Schwe~ Janu~ 2005     NA 2005-01-01
#>  6 Katar                          Schwe~ Janu~ 2005     NA 2005-01-01
#>  7 Kuwait                         Schwe~ Janu~ 2005     NA 2005-01-01
#>  8 Australien                     Schwe~ Janu~ 2005     NA 2005-01-01
#>  9 Neuseeland, Ozeanien           Schwe~ Janu~ 2005     NA 2005-01-01
#> 10 Oman                           Schwe~ Janu~ 2005     NA 2005-01-01
#> # i 51,385 more rows
# Preview the first rows of the cleaned table
tourism_data %>% head()
#> # A tibble: 6 x 6
#>   Herkunftsland     Kanton  Monat   Jahr   value Date      
#>   <chr>             <chr>   <fct>   <chr>  <dbl> <date>    
#> 1 Schweiz           Schweiz January 2005  482820 2005-01-01
#> 2 Baltische Staaten Schweiz January 2005     758 2005-01-01
#> 3 Deutschland       Schweiz January 2005  135741 2005-01-01
#> 4 Frankreich        Schweiz January 2005   34248 2005-01-01
#> 5 Italien           Schweiz January 2005   34282 2005-01-01
#> 6 Österreich        Schweiz January 2005    9194 2005-01-01

1.1.2.1 Deal with missing values (NA)

1.1.2.1.1 Impute missing values with ARIMA

If the missing values are random or if excluding them would result in a loss of valuable information, we might consider imputing them. One common approach is to use statistical models like ARIMA to interpolate missing values based on the patterns observed in the available data.

# #Creating a tsibble with missing values
# data <- tourism_data_zurich_philippines %>%
#   as_tsibble(key = c(Kanton, Herkunftsland, Monat, Jahr)) %>%
#   select(Date, value) %>%
#   fill_gaps()
# 
# # Fit an ARIMA model to data with missing values
# model_fit <- data %>%
#   model(ARIMA(value))
# 
# # Interpolate missing values using the fitted ARIMA model
# filled_data <- model_fit %>%
#   interpolate(data)
# 
# # Print the data with filled in missing values
# print(filled_data)

1.1.3 Countries visiting Zurich

# Keep only the rows whose 'Kanton' is Zürich
tourism_data_zurich <- filter(tourism_data, Kanton == "Zürich")
# Count the missing (NA) cells in the Zurich subset
tourism_data_zurich %>%
  is.na() %>%
  sum()
#> [1] 1869
# List the Zurich rows whose 'value' is missing to see where the gaps are
tourism_data_zurich %>%
  filter(is.na(value))
#> # A tibble: 1,869 x 6
#>    Herkunftsland                  Kanton Monat Jahr  value Date      
#>    <chr>                          <chr>  <fct> <chr> <dbl> <date>    
#>  1 Malta                          Zürich Janu~ 2005     NA 2005-01-01
#>  2 Zypern                         Zürich Janu~ 2005     NA 2005-01-01
#>  3 Mexiko                         Zürich Janu~ 2005     NA 2005-01-01
#>  4 Übriges Zentralamerika, Karib~ Zürich Janu~ 2005     NA 2005-01-01
#>  5 Bahrain                        Zürich Janu~ 2005     NA 2005-01-01
#>  6 Katar                          Zürich Janu~ 2005     NA 2005-01-01
#>  7 Kuwait                         Zürich Janu~ 2005     NA 2005-01-01
#>  8 Australien                     Zürich Janu~ 2005     NA 2005-01-01
#>  9 Neuseeland, Ozeanien           Zürich Janu~ 2005     NA 2005-01-01
#> 10 Oman                           Zürich Janu~ 2005     NA 2005-01-01
#> # i 1,859 more rows
# Preview the first rows of the Zurich subset
tourism_data_zurich %>% head()
#> # A tibble: 6 x 6
#>   Herkunftsland     Kanton Monat   Jahr  value Date      
#>   <chr>             <chr>  <fct>   <chr> <dbl> <date>    
#> 1 Schweiz           Zürich January 2005  41094 2005-01-01
#> 2 Baltische Staaten Zürich January 2005    144 2005-01-01
#> 3 Deutschland       Zürich January 2005  22537 2005-01-01
#> 4 Frankreich        Zürich January 2005   3870 2005-01-01
#> 5 Italien           Zürich January 2005   3828 2005-01-01
#> 6 Österreich        Zürich January 2005   3006 2005-01-01

1.1.3.1 Plot time series

# Prepare the data for plotting.
# Drop the rows whose country of origin ('Herkunftsland') is 'Schweiz'.
tourism_data_zurich <- tourism_data_zurich %>%
  filter(Herkunftsland != "Schweiz")
# Total trips per country of origin and date.
# No month/year columns are derived here: summarise() keeps only the grouping
# columns and 'Trips', so any such helper columns would be discarded anyway.
data <- tourism_data_zurich %>%
  filter(!is.na(value)) %>%  # rows without a value cannot contribute to the sum
  group_by(Herkunftsland, Date) %>%
  summarise(Trips = sum(value), .groups = "drop")

# Plot one line per country of origin; 'Philippinen' is drawn in red and every
# other country in grey.
# The colour legend is switched off in geom_line(), so no guides() call is
# needed (a guide for a hidden legend would never be drawn).
ggplot(data, aes(x = Date, y = Trips, group = Herkunftsland)) +
  geom_line(aes(color = Herkunftsland == "Philippinen"), show.legend = FALSE) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
  labs(
    title = "Number of Trips from Each Country to Zurich",
    x = "Date",
    y = "Number of Trips"
  ) +
  theme_minimal()

# Base ggplot: the same highlighted line chart, plus a 'text' aesthetic that
# supplies the tooltip content
p <- ggplot(
  data,
  aes(
    x = Date,
    y = Trips,
    group = Herkunftsland,
    color = Herkunftsland == "Philippinen",
    text = paste("Country:", Herkunftsland, "<br>Trips:", Trips)
  )
) +
  geom_line(show.legend = FALSE) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
  labs(
    title = "Number of Trips from Each Country to Zurich",
    x = "Date",
    y = "Number of Trips"
  ) +
  theme_minimal()

# Turn the ggplot into a plotly object whose tooltip shows only the 'text'
# aesthetic, then set the margins and legend placement in the same pipeline
interactive_plot <- ggplotly(p, tooltip = "text") %>%
  layout(
    margin = list(l = 60, r = 60, b = 60, t = 80),
    legend = list(orientation = "h", x = 0, xanchor = "left", y = -0.2)
  )

# Render the interactive plot
interactive_plot

1.1.4 Filter data for Zurich and the Philippines

1.1.4.1 Pattern

1.1.4.1.1 Decompose
# Keep only the rows whose country of origin is the Philippines ('Philippinen')
tourism_data_zurich_philippines <- filter(
  tourism_data_zurich,
  Herkunftsland == "Philippinen"
)
tourism_data_zurich_philippines %>% head()
#> # A tibble: 6 x 6
#>   Herkunftsland Kanton Monat    Jahr  value Date      
#>   <chr>         <chr>  <fct>    <chr> <dbl> <date>    
#> 1 Philippinen   Zürich January  2005     57 2005-01-01
#> 2 Philippinen   Zürich February 2005     30 2005-02-01
#> 3 Philippinen   Zürich March    2005     46 2005-03-01
#> 4 Philippinen   Zürich April    2005     73 2005-04-01
#> 5 Philippinen   Zürich May      2005     74 2005-05-01
#> 6 Philippinen   Zürich June     2005     73 2005-06-01

# Convert the Philippines/Zurich series to a monthly `ts` object
tourism_ts <- tourism_data_zurich_philippines %>%
  # Make sure every month between the first and last observation has a row
  complete(Date = seq.Date(min(Date), max(Date), by = "month")) %>%
  # Sort after completing so the values are in chronological order for ts()
  arrange(Date) %>%
  # Missing months/values are set to 0.
  # NOTE(review): this treats "no data" as "no visitors" - confirm that is intended.
  replace_na(list(value = 0)) %>%
  # Give the start as c(year, month): this is exact for any starting month,
  # whereas a decimal date only falls on the monthly grid for January.
  with(ts(
    value,
    frequency = 12,
    start = c(year(min(Date)), month(min(Date)))
  ))

# Decompose the time series
decomposed <- decompose(tourism_ts)

# Plot the decomposed components
plot(decomposed)

1.1.4.1.2 Seasonality
# One chart
# several chart per month

1.2 Tourism EDA - Zurich with Philippine visitors

#filter tourism_data for Philipine visitors in Zurich and autoplot the time series with phili